{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### case 1 : using label\n",
    "### case 2 : using Google (G) Cloud Speech API"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 01 create vocabulary from transcription"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import csv\n",
    "import pickle\n",
    "import numpy as np\n",
    "from nlp_util import *\n",
    "from nltk.tokenize import word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "128\n",
      "1\n",
      "14.819005877079391\n",
      "12.20187522097008\n"
     ]
    }
   ],
   "source": [
    "# Read the transcription text: the second field (index 1) of every CSV row.\n",
    "# For case 2 (G cloud speech API), read processed_tran_fromG.csv instead.\n",
    "with open('../data/processed/IEMOCAP/processed_tran.csv') as f:\n",
    "    lines = [row[1] for row in csv.reader(f)]\n",
    "\n",
    "# Tokenize each utterance and keep a lower-cased copy of the tokens.\n",
    "token_lines = [word_tokenize(x) for x in lines]\n",
    "token_lines_lower = [[t.lower() for t in x] for x in token_lines]\n",
    "\n",
    "# Utterance length in tokens: max, min, mean, std.\n",
    "# print(...) with a single argument behaves the same on Python 2 and Python 3.\n",
    "sent_len = [len(x) for x in token_lines]\n",
    "print(np.max(sent_len))\n",
    "print(np.min(sent_len))\n",
    "print(np.mean(sent_len))\n",
    "print(np.std(sent_len))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_data(dic, lines):\n",
    "    \"\"\"Accumulate lower-cased token counts from `lines` into `dic`.\n",
    "\n",
    "    Args:\n",
    "        dic: dict mapping token to count; it is updated in place.\n",
    "        lines: iterable of token sequences; every token must support .lower().\n",
    "\n",
    "    Returns:\n",
    "        The same `dic` object that was passed in, with the new counts added.\n",
    "    \"\"\"\n",
    "    for sentence in lines:\n",
    "        for raw_token in sentence:\n",
    "            key = raw_token.lower()\n",
    "            # Missing keys start from zero, so first sightings end up at 1.\n",
    "            dic[key] = dic.get(key, 0) + 1\n",
    "    return dic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dic size : 3745\n"
     ]
    }
   ],
   "source": [
    "# Count how often each lower-cased token occurs over all transcriptions.\n",
    "dic_count = read_data({}, token_lines_lower)\n",
    "# Parenthesized single-argument print works on both Python 2 and Python 3.\n",
    "print('dic size : ' + str(len(dic_count)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# not required - already small dictionary\n",
    "# dic_count = apply_mincut(dic_count, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3747\n",
      "0\n"
     ]
    }
   ],
   "source": [
    "# Build the token-to-index vocabulary. The first two indices are reserved:\n",
    "# 0 for '_PAD_' and 1 for '_UNK_'; counted tokens follow in the iteration\n",
    "# order of dic_count.\n",
    "dic = {}\n",
    "dic['_PAD_'] = len(dic)\n",
    "dic['_UNK_'] = len(dic)\n",
    "\n",
    "for word in dic_count:\n",
    "    dic[word] = len(dic)\n",
    "print(len(dic))\n",
    "print(dic['_PAD_'])\n",
    "\n",
    "# Pickle output is binary data, so the file is opened with 'wb'; text mode\n",
    "# fails on Python 3 and can corrupt the file on Windows.\n",
    "# For case 2 (G cloud speech API), save to dic_G.pkl instead.\n",
    "with open('../data/processed/IEMOCAP/dic.pkl', 'wb') as f:\n",
    "    pickle.dump(dic, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 02 create indexed data with vocabulary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "128\n",
      "1\n",
      "14.819005877079391\n",
      "12.20187522097008\n"
     ]
    }
   ],
   "source": [
    "# Re-read and re-tokenize the transcriptions (same steps as in section 01),\n",
    "# presumably so this section can be run on its own - TODO confirm.\n",
    "# For case 2 (G cloud speech API), read processed_tran_fromG.csv instead.\n",
    "with open('../data/processed/IEMOCAP/processed_tran.csv') as f:\n",
    "    lines = [row[1] for row in csv.reader(f)]\n",
    "\n",
    "token_lines = [word_tokenize(x) for x in lines]\n",
    "token_lines_lower = [[t.lower() for t in x] for x in token_lines]\n",
    "\n",
    "# Utterance length in tokens: max, min, mean, std.\n",
    "# print(...) with a single argument behaves the same on Python 2 and Python 3.\n",
    "sent_len = [len(x) for x in token_lines]\n",
    "print(np.max(sent_len))\n",
    "print(np.min(sent_len))\n",
    "print(np.mean(sent_len))\n",
    "print(np.std(sent_len))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dic = {}\n",
    "# with open('../data/processed/IEMOCAP/dic_G.pkl') as f:\n",
    "#     dic = pickle.load(f)\n",
    "# print dic['_PAD_']\n",
    "# print len(dic)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert every token to its vocabulary index. Tokens that are missing from\n",
    "# the vocabulary map to the reserved '_UNK_' index instead of raising KeyError\n",
    "# (relevant when `dic` was loaded from a pickle built on other transcriptions).\n",
    "unk_index = dic['_UNK_']\n",
    "index_lines = [[dic.get(t, unk_index) for t in x] for x in token_lines_lower]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 03 save as numpy\n",
    "- max length = 128 (100% coverage)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10039, 128)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Allocate one row per utterance with 128 token slots each. The zero fill\n",
    "# equals the '_PAD_' index, so unused slots are already padded.\n",
    "# The row count comes from the data instead of a hard-coded 10039, and the\n",
    "# builtin int replaces np.int (a plain alias of int, removed in NumPy 1.24).\n",
    "np_trans = np.zeros([len(index_lines), 128], dtype=int)\n",
    "np.shape(np_trans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy each index sequence into its row, truncated to the row width.\n",
    "# Shorter sequences leave their trailing zeros untouched as padding.\n",
    "# enumerate replaces xrange, which does not exist on Python 3.\n",
    "max_len = np_trans.shape[1]\n",
    "for i, indices in enumerate(index_lines):\n",
    "    clipped = indices[:max_len]\n",
    "    np_trans[i, :len(clipped)] = clipped"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the padded index matrix to disk in .npy format.\n",
    "np.save(file='../data/processed/IEMOCAP/processed_trans.npy', arr=np_trans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10039, 128)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the dimensions of the matrix as (rows, columns).\n",
    "np_trans.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 03 save as numpy - G case - use it when ready\n",
    "- max length = 128 (100% coverage)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# len(index_lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# np_trans = np.zeros( [5531, 128], dtype=np.int)\n",
    "# np.shape(np_trans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for i in xrange( len(index_lines) ):\n",
    "    \n",
    "#     if len( index_lines[i] ) > 127:\n",
    "#         np_trans[i][:] = index_lines[i][:128]\n",
    "#     else:\n",
    "#         np_trans[i][:len(index_lines[i])] = index_lines[i][:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# np.save('../data/processed/IEMOCAP/processed_trans_G.npy', np_trans)\n",
    "# np.shape(np_trans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:tf14_p27]",
   "language": "python",
   "name": "conda-env-tf14_p27-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
